{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1.unicode和utf-8编码\n",
    "**python3里只有unicode编码格式的字节串才能叫作str，其他编码格式的统统都叫bytes，如：gbk，utf-8，gb2312**\n",
    "\n",
    "- `bytes.decode('utf8')` 将bytes解码为unicode字符串\n",
    "\n",
    "- `str.encode('utf8')` 将unicode字符串编码为bytes形式\n",
    "\n",
    "`chardet.detect(bytes_data)`可以检测bytes的编码类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import torch\n",
    "import chardet\n",
    "import unicodedata\n",
    "import numpy as np\n",
    "from torch import nn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.数据类型: <class 'str'>\n",
      "2.编码为utf8类型: b'\\xe5\\x8c\\x97\\xe4\\xba\\xac'\n",
      "3.字符串解码结果: 北京\n",
      "4.bytes数据的编码类型: {'encoding': 'utf-8', 'confidence': 0.7525, 'language': ''}\n"
     ]
    }
   ],
   "source": [
    "s = '北京'\n",
    "utf8_s = s.encode('utf8')              # unicode=>utf-8   'GBK'、'gb2312'、'ascii'\n",
    "str_s  = utf8_s.decode('utf8')      \n",
    "s_type = chardet.detect(utf8_s)\n",
    "print('1.数据类型:',type(s))\n",
    "print('2.编码为utf8类型:', utf8_s)          # b''表示字节数据\n",
    "print('3.字符串解码结果:', str_s)\n",
    "print('4.bytes数据的编码类型:', s_type)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. sub正则匹配"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Are your parents home ?'"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.sub(r\"([.!?])\", r\" \\1\", 'Are your parents home?')             # 加个空格"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Are your parents home ?'"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "re.sub(r\"[^a-zA-Z.!?]+\", r\" \", 'Are your parents home ?')        # 将[]以外的字符换为空格"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('s', 5), ('a', 2), ('b', 1)]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "l = [('a', 2), ('b', 1), ('s', 5)]\n",
    "sorted(l, key=lambda x:x[1], reverse=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "sorted?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2, 1, 64])"
      ]
     },
     "execution_count": 122,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 141,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[8]"
      ]
     },
     "execution_count": 141,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([3, 4, 2])"
      ]
     },
     "execution_count": 160,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "torch.Size([2])"
      ]
     },
     "execution_count": 161,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([2, 3, 4])"
      ]
     },
     "execution_count": 163,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
